In [48]:
import pandas as pd
In [71]:
# Load the listings CSV into a DataFrame. The path is relative, so the file
# must sit in the kernel's current working directory.
df = pd.read_csv("AB_NYC_2019.csv")
In [50]:
# Preview the first five rows to check column names and value formats.
df.head()
Out[50]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 19-10-2018 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 21-05-2019 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 05-07-2019 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 19-11-2018 0.10 1 0
In [75]:
# Store the identifier and coordinate columns as strings so they are treated
# as labels rather than quantities; numeric-only summaries such as
# corr(numeric_only=True) will then leave them out.
for column in ("id", "host_id", "latitude", "longitude"):
    df[column] = df[column].astype(str)

How does the data look mathematically?

In [52]:
# Summary statistics for the numeric columns.
# In the recorded output, reviews_per_month has a lower count than the other
# columns (38854 vs 48906), so it contains missing values, and the minimum
# price is 0.
df.describe()
Out[52]:
price minimum_nights number_of_reviews reviews_per_month calculated_host_listings_count availability_365
count 48906.000000 48906.000000 48906.000000 38854.000000 48906.000000 48906.000000
mean 152.711324 7.031612 23.300454 1.373151 7.142702 112.782031
std 240.128713 20.512489 44.607175 1.680270 32.948926 131.620370
min 0.000000 1.000000 0.000000 0.010000 1.000000 0.000000
25% 69.000000 1.000000 1.000000 0.190000 1.000000 0.000000
50% 106.000000 3.000000 5.000000 0.720000 1.000000 45.000000
75% 175.000000 5.000000 24.000000 2.020000 2.000000 227.000000
max 10000.000000 1250.000000 629.000000 58.500000 327.000000 365.000000

Minimum nights for listings range from 1 to 1250.

Categorical Data¶

In [53]:
# Number of distinct values per column (NaN is not counted by default).
# NOTE(review): the recorded output shows 48895 distinct ids while describe()
# reports 48906 rows, so some ids are duplicated or missing -- worth checking
# with df["id"].duplicated().sum().
df.nunique()
Out[53]:
id                                48895
name                              47896
host_id                           37457
host_name                         11452
neighbourhood_group                   5
neighbourhood                       221
latitude                          19048
longitude                         14718
room_type                             3
price                               674
minimum_nights                      109
number_of_reviews                   394
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64
In [54]:
# List the column labels of the DataFrame.
df.columns
Out[54]:
Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')
In [55]:
# Number of rows per room type, most frequent first.
df["room_type"].value_counts()
Out[55]:
room_type
Entire home/apt    25414
Private room       22332
Shared room         1160
Name: count, dtype: int64
In [56]:
# normalize=True returns each room type's share of all rows (fractions that
# sum to 1) instead of raw counts.
df["room_type"].value_counts(normalize = True)
Out[56]:
room_type
Entire home/apt    0.519650
Private room       0.456631
Shared room        0.023719
Name: proportion, dtype: float64
In [57]:
# Share of rows per neighbourhood group, expressed as a percentage and
# rendered with three decimal places plus a trailing "%" sign.
percentage_counts = df["neighbourhood_group"].value_counts(normalize=True).mul(100)
print(percentage_counts.map(lambda pct: f"{pct:.3f}%"))
neighbourhood_group
Manhattan        44.307%
Brooklyn         41.114%
Queens           11.585%
Bronx             2.231%
Staten Island     0.763%
Name: proportion, dtype: object
In [58]:
# Count rows per neighbourhood and turn the result into a two-column table.
# The rename relies on value_counts() naming its result "count".
# NOTE(review): the label says "Hotels", but the surrounding notes describe
# the rows as listings -- consider renaming the column to "No. of Listings".
df["neighbourhood"].value_counts().reset_index().rename(columns = {"count" : "No. of Hotels"})
Out[58]:
neighbourhood No. of Hotels
0 Williamsburg 3921
1 Bedford-Stuyvesant 3715
2 Harlem 2658
3 Bushwick 2465
4 Upper West Side 1974
... ... ...
216 Fort Wadsworth 1
217 Richmondtown 1
218 New Dorp 1
219 Rossville 1
220 Willowbrook 1

221 rows × 2 columns

Numerical Data¶

In [59]:
# bins=5 splits the observed price range into five equal-width intervals.
# In the recorded output almost every row (48820 of 48906) lands in the first
# interval, so equal-width bins say little about typical prices.
df["price"].value_counts(bins=5)
Out[59]:
(-10.001, 2000.0]    48820
(2000.0, 4000.0]        54
(4000.0, 6000.0]        16
(6000.0, 8000.0]         9
(8000.0, 10000.0]        7
Name: count, dtype: int64
In [60]:
# Hand-picked, uneven bin edges: narrow intervals at the low end and wide ones
# for the sparse high prices. The -10 lower edge gives price == 0 its own
# interval, (-10.001, 0.0].
bins = [-10,0, 50,100, 200,500,800,2000,4000,10000]
df["price"].value_counts(bins = bins)
Out[60]:
(50.0, 100.0]        17373
(100.0, 200.0]       16588
(200.0, 500.0]        7340
(0.0, 50.0]           6550
(500.0, 800.0]         624
(800.0, 2000.0]        334
(2000.0, 4000.0]        54
(4000.0, 10000.0]       32
(-10.001, 0.0]          11
Name: count, dtype: int64

Binning values into a handful of ranges like this is mainly helpful for small datasets.

Measures of central tendency and dispersion¶

In [61]:
# Arithmetic mean of the price column.
df["price"].mean()
Out[61]:
152.71132376395533
In [62]:
# Median price. The recorded value (106) sits well below the mean (~152.7),
# which points to a right-skewed distribution with a few very high prices.
df["price"].median()
Out[62]:
106.0
In [63]:
# Sample standard deviation of price (pandas uses ddof=1 by default). This
# measures dispersion around the mean rather than central tendency.
df["price"].std()
Out[63]:
240.1287131622509
In [64]:
# Arithmetic mean of the minimum_nights column.
df["minimum_nights"].mean()
Out[64]:
7.031611663190611
In [65]:
# Median of minimum_nights. The recorded median (3) is far below the mean
# (~7.03): a few very large values (the maximum is 1250) pull the mean up.
df["minimum_nights"].median()
Out[65]:
3.0

Measures of Shape (Skewness and Kurtosis)¶

In [66]:
# Skewness of price. A positive value means a long right tail; the recorded
# value (~19.1) is consistent with the mean being well above the median.
df["price"].skew()
Out[66]:
19.120831694826197
In [67]:
# Excess (Fisher) kurtosis of price, where a normal distribution scores 0.
# It measures how heavy the tails are (how outlier-prone the data is), not the
# height of the distribution. The recorded value (~586) reflects extreme
# high-price outliers.
df["price"].kurt()
Out[67]:
585.7930484394186

How many listings are available throughout the year (365 days)?

In [68]:
# Count the rows whose availability_365 value is exactly 365.
available_all_year = df["availability_365"] == 365
len(df[available_all_year])
Out[68]:
1295
In [79]:
# Pairwise correlation between columns (Pearson, the default method).
# Missing values are excluded when each pair is computed. Non-numeric columns
# are left out only because numeric_only=True is passed; it is not automatic.
# In the recorded output the strongest relationship is number_of_reviews vs
# reviews_per_month (~0.55); price is only weakly correlated with the rest.
df.corr(numeric_only=True) 
Out[79]:
price minimum_nights number_of_reviews reviews_per_month calculated_host_listings_count availability_365
price 1.000000 0.042771 -0.048014 -0.030608 0.057478 0.081817
minimum_nights 0.042771 1.000000 -0.080093 -0.121772 0.127917 0.144146
number_of_reviews -0.048014 -0.080093 1.000000 0.549291 -0.072375 0.172002
reviews_per_month -0.030608 -0.121772 0.549291 1.000000 -0.009414 0.185818
calculated_host_listings_count 0.057478 0.127917 -0.072375 -0.009414 1.000000 0.225680
availability_365 0.081817 0.144146 0.172002 0.185818 0.225680 1.000000
In [ ]: